Finding Common Origins of Milky Way Stars

Author

Andersen Chang, Tiffany M. Tang, Tarek M. Zikry, Genevera I. Allen

Published

May 30, 2025

Fit Clustering Pipelines on Training Data

Show Code to Fit Clustering Pipelines on Training Data
## this code chunk fits the clustering pipelines on the training data

#### choose imputation methods ####
# candidate training datasets, one per missing-data imputation strategy
data_ls <- list(
  "Mean-imputed" = data_mean_imputed$train,
  "RF-imputed" = data_rf_imputed$train
)

#### choose number of features ####
# candidate feature-set sizes used by the preprocessing pipeline
feature_modes <- list(
  "Small" = 7,
  "Medium" = 11,
  "Big" = 19
)

#### choose ks ####
# candidate numbers of clusters
ks <- 2:30

#### choose dimension reduction methods ####
# raw data (identity transform, i.e., no dimension reduction)
identity_fun_ls <- list("Raw" = function(x) x)

# pca, keeping the first 4 principal components
pca_fun_ls <- list("PCA" = purrr::partial(fit_pca, ndim = 4))

# tsne: one 2D embedding per perplexity setting
tsne_perplexities <- c(30, 100)
tsne_fun_ls <- tsne_perplexities |>
  purrr::map(
    \(perplexity) purrr::partial(fit_tsne, dims = 2, perplexity = perplexity)
  ) |>
  setNames(sprintf("tSNE (perplexity = %d)", tsne_perplexities))

# putting it together
dr_fun_ls <- c(
  identity_fun_ls,
  pca_fun_ls,
  tsne_fun_ls
)

#### choose clustering methods ####
# kmeans, fit once per candidate number of clusters in `ks`
kmeans_fun_ls <- list("K-means" = purrr::partial(fit_kmeans, ks = ks))

# hierarchical clustering: one pipeline per (distance, linkage) combination.
# stringsAsFactors = FALSE keeps d/linkage as character strings; the
# expand.grid() default (TRUE) would silently pass factors into fit_hclust.
hclust_params <- list(
  d = c("euclidean"),
  linkage = c("complete", "ward.D")
) |> 
  expand.grid(stringsAsFactors = FALSE)
hclust_fun_ls <- purrr::map(
  seq_len(nrow(hclust_params)),  # seq_len() is safe even for an empty grid
  \(i) purrr::partial(
    fit_hclust, 
    ks = ks,
    d = hclust_params$d[[i]],
    linkage = hclust_params$linkage[[i]]
  )
) |> 
  setNames(
    sprintf(
      "Hierarchical (dist = %s, link = %s)",
      hclust_params$d, hclust_params$linkage
    )
  )

# spectral clustering on a nearest-neighbor affinity graph,
# one pipeline per neighborhood size
n_neighbors <- c(5, 30, 60, 100)
spectral_fun_ls <- purrr::map(
  n_neighbors,
  \(nn) purrr::partial(
    fit_spectral_clustering, 
    ks = ks,
    affinity = "nearest_neighbors",
    n_neighbors = nn
  )
) |> 
  setNames(sprintf("Spectral (n_neighbors = %s)", n_neighbors))

# putting it together
clust_fun_ls <- c(
  kmeans_fun_ls,
  hclust_fun_ls,
  spectral_fun_ls
)

#### Fit Clustering Pipelines ####
# build one row per candidate pipeline: the cross-product of imputation
# method, feature-set size, dimension-reduction method, and clustering method
pipe_tib <- tidyr::expand_grid(
  data = data_ls,
  feature_mode = feature_modes,
  dr_method = dr_fun_ls,
  clust_method = clust_fun_ls
) |> 
  dplyr::mutate(
    # names() on each list-column recovers the original list element names
    # (relies on expand_grid keeping names on list-columns, row-aligned)
    impute_mode_name = names(data),
    feature_mode_name = names(feature_mode),
    dr_method_name = names(dr_method),
    clust_method_name = names(clust_method),
    # human-readable pipeline label, e.g.,
    # "K-means [Mean-imputed + Small + PCA]"
    name = stringr::str_glue(
      "{clust_method_name} [{impute_mode_name} + {feature_mode_name} + {dr_method_name}]"
    )
  ) |> 
  # remove some clustering pipelines to reduce computation burden
  dplyr::filter(
    # remove all big feature set + dimension-reduction runs
    !((dr_method_name != "Raw") & (feature_mode_name == "Big"))
  )
# split into a list of single-row tibbles, one per pipeline, named by label
pipe_ls <- split(pipe_tib, seq_len(nrow(pipe_tib))) |> 
  setNames(pipe_tib$name)

# cache file for the fitted training-data clustering pipelines
fit_results_fname <- file.path(RESULTS_PATH, "clustering_fits_train.rds")
if (file.exists(fit_results_fname)) {
  # read in fitted clustering pipelines (if already cached)
  clust_fit_ls <- readRDS(fit_results_fname)
} else {
  library(future)
  plan(multisession, workers = NCORES)

  # fit each clustering pipeline in parallel (if not already cached)
  clust_fit_ls <- furrr::future_map(
    pipe_ls,
    function(pipe_df) {
      # build the preprocessing step (feature selection + dim. reduction)
      preprocess_fun <- create_preprocessing_pipeline(
        feature_mode = pipe_df$feature_mode[[1]],
        preprocess_fun = pipe_df$dr_method[[1]]
      )
      # run the clustering method on the preprocessed training data
      pipe_df$clust_method[[1]](
        data = pipe_df$data[[1]], preprocess_fun = preprocess_fun
      )
    },
    .options = furrr::furrr_options(
      seed = TRUE, 
      # pass the helpers/hyperparameters each worker needs explicitly
      globals = list(
        ks = ks,
        create_preprocessing_pipeline = create_preprocessing_pipeline,
        get_abundance_data = get_abundance_data,
        tsne_perplexities = tsne_perplexities,
        hclust_params = hclust_params,
        n_neighbors = n_neighbors,
        fit_kmeans = fit_kmeans,
        fit_hclust = fit_hclust,
        fit_spectral_clustering = fit_spectral_clustering
      )
    )
  )
  # save fitted clustering pipelines to the cache
  saveRDS(clust_fit_ls, file = fit_results_fname)
}

To see and interact with the clustering results on the training data, we have developed a Shiny app. Run it locally in R/RStudio:

view_clusters_app()

Hyperparameter Tuning and Model Selection via Cluster Stability

Show Code to Tune/Evaluate Clustering Pipelines via Cluster Stability
## this code chunk performs model selection and hyperparameter tuning for the trained clustering pipelines

#### ModelExplorer hyperparameters ####
n_subsamples <- 50     # number of subsamples to draw
subsample_frac <- 0.8  # proportion of observations kept per subsample
max_pairs <- 100       # max number of subsample pairs for cluster stability
metric <- "ARI"        # cluster stability metric to compute

# number of observations in the full training data
n <- nrow(data_ls$`Mean-imputed`)
# cache paths for the subsampled fits and their stability evaluations
fit_results_fname <- file.path(RESULTS_PATH, "clustering_subsampled_fits.rds")
stability_results_fname <- file.path(
  RESULTS_PATH, paste0("clustering_subsampled_stability_", metric, ".RData")
)
if (!file.exists(fit_results_fname) ||
    !file.exists(stability_results_fname)) {
  library(future)
  plan(multisession, workers = NCORES)

  # fit clustering pipelines on subsampled data (if not already cached);
  # outer sequential loop over subsamples, inner parallel loop over pipelines
  clust_subsampled_fit_ls <- purrr::map(
    seq_len(n_subsamples),
    function(b) {
      # draw the b-th subsample without replacement; floor() makes the
      # otherwise-silent truncation of a non-integer size explicit
      subsampled_idxs <- sort(
        sample(n, size = floor(subsample_frac * n), replace = FALSE)
      )
      furrr::future_map(
        pipe_ls,
        function(pipe_df) {
          # fit clustering pipeline on subsample
          g <- create_preprocessing_pipeline(
            feature_mode = pipe_df$feature_mode[[1]],
            preprocess_fun = pipe_df$dr_method[[1]]
          )
          clust_out <- pipe_df$clust_method[[1]](
            data = pipe_df$data[[1]][subsampled_idxs, ], preprocess_fun = g
          )
          # return only the cluster ids, expanded back to full-data length
          # with NA for observations left out of this subsample
          cluster_ids <- purrr::map(
            clust_out$cluster_ids,
            function(cluster_ids) {
              out <- rep(NA, nrow(pipe_df$data[[1]]))
              out[subsampled_idxs] <- cluster_ids
              return(out)
            }
          )
          return(cluster_ids)
        },
        .options = furrr::furrr_options(
          seed = TRUE, 
          # pass the helpers/hyperparameters each worker needs explicitly
          globals = list(
            ks = ks,
            subsampled_idxs = subsampled_idxs,
            create_preprocessing_pipeline = create_preprocessing_pipeline,
            get_abundance_data = get_abundance_data,
            tsne_perplexities = tsne_perplexities,
            hclust_params = hclust_params,
            n_neighbors = n_neighbors,
            fit_kmeans = fit_kmeans,
            fit_hclust = fit_hclust,
            fit_spectral_clustering = fit_spectral_clustering
          )
        )
      )
    }
  ) |> 
    setNames(as.character(seq_len(n_subsamples))) |> 
    # flatten the nested subsample -> pipeline -> k structure into a flat
    # list named like "k: pipeline subsample"
    purrr::list_flatten(name_spec = "{inner} {outer}") |> 
    purrr::list_flatten(name_spec = "{inner}: {outer}")
  # save fitted clustering pipelines on subsampled data
  saveRDS(clust_subsampled_fit_ls, file = fit_results_fname)
  
  # create tibble of clustering results, annotated with pipeline metadata
  clusters_tib <- tibble::tibble(
    name = names(clust_subsampled_fit_ls),
    cluster_ids = clust_subsampled_fit_ls
  ) |> 
    annotate_clustering_results()
  
  # evaluate stability of clustering results per pipeline: gather all
  # subsampled cluster assignments for each (k, pipeline) combination
  stability_per_pipeline <- clusters_tib |> 
    dplyr::group_by(
      k, pipeline_name, 
      impute_mode_name, feature_mode_name, dr_method_name, clust_method_name
    ) |>
    dplyr::summarise(
      cluster_list = list(cluster_ids),
      .groups = "drop"
    )
  stability_vals <- furrr::future_map(
    stability_per_pipeline$cluster_list,
    function(clusters) {
      cluster_stability(
        clusters, max_pairs = max_pairs, metrics = metric
      )[[metric]]
    },
    .options = furrr::furrr_options(
      seed = TRUE, 
      globals = list(
        cluster_stability = cluster_stability,
        max_pairs = max_pairs,
        metric = metric
      )
    )
  )
  stability_per_pipeline <- stability_per_pipeline |> 
    dplyr::mutate(
      group_name = pipeline_name,
      # !! injects the precomputed list of stability values as a list-column
      stability = !!stability_vals
    )
  
  # evaluate stability of clustering results per clustering method,
  # pooling assignments across all preprocessing pipelines
  stability_per_clust_method <- clusters_tib |> 
    dplyr::group_by(
      k, clust_method_name
    ) |>
    dplyr::summarise(
      pipeline_names = list(unique(pipeline_name)),
      cluster_list = list(cluster_ids),
      .groups = "drop"
    )
  stability_vals <- furrr::future_map2(
    stability_per_clust_method$cluster_list,
    stability_per_clust_method$pipeline_names,
    function(clusters, pipe_names) {
      # scale max_pairs by the number of pooled pipelines so each method
      # gets a comparable number of pairwise comparisons per pipeline
      cluster_stability(
        clusters, max_pairs = max_pairs * length(pipe_names), metrics = metric
      )[[metric]]
    },
    .options = furrr::furrr_options(
      seed = TRUE, 
      globals = list(
        cluster_stability = cluster_stability,
        max_pairs = max_pairs,
        metric = metric
      )
    )
  )
  stability_per_clust_method <- stability_per_clust_method |>
    dplyr::mutate(
      group_name = clust_method_name,
      stability = !!stability_vals
    )
  
  # save both stability summaries to the cache
  save(
    stability_per_pipeline, 
    stability_per_clust_method, 
    file = stability_results_fname
  )
} else {
  # read in fitted clustering pipelines on subsampled data (if already cached)
  clust_subsampled_fit_ls <- readRDS(fit_results_fname)
  # read in evaluated stability of clustering results
  load(stability_results_fname)
}

Stability of clustering pipelines across the number of clusters, measured via the adjusted Rand Index (ARI).

Stability of clustering pipelines across the number of clusters, measured via the adjusted Rand Index (ARI).

Estimate Consensus Clusters

Show Code to Estimate Consensus Clusters
# manually selected "best" pipelines (chosen from the stability results):
# each entry records a clustering method name and a number of clusters k
best_clust_methods <- list(
  "k = 2: Spectral (n_neighbors = 60)" = list(
    clust_method_name = "Spectral (n_neighbors = 60)",
    k = 2
  ),
  "k = 9: Spectral (n_neighbors = 60)" = list(
    clust_method_name = "Spectral (n_neighbors = 60)",
    k = 9
  ),
  "k = 8: K-means" = list(
    clust_method_name = "K-means", 
    k = 8
  )
)
# keep only the cluster id vectors from each training fit and flatten the
# per-fit lists into one flat list named like "k: pipeline"
clust_fit_ls <- clust_fit_ls |>
  purrr::map(\(fit) fit$cluster_ids) |>
  purrr::list_flatten(name_spec = "{inner}: {outer}")

# cache paths for the consensus clusters and neighborhood matrices
consensus_clusters_results_path <- file.path(
  RESULTS_PATH, "consensus_clusters_train.rds"
)
consensus_nbhd_results_path <- file.path(
  RESULTS_PATH, "consensus_neighborhood_matrices_train.rds"
)
# use scalar || (not vectorized |) in the if condition
if (!file.exists(consensus_clusters_results_path) ||
    !file.exists(consensus_nbhd_results_path)) {
  # annotate all training fits once; this is loop-invariant, so hoist it out
  # of the loop below (each iteration only filters it differently)
  clusters_tib_all <- tibble::tibble(
    name = names(clust_fit_ls),
    cluster_ids = clust_fit_ls
  ) |> 
    annotate_clustering_results()

  nbhd_mat_ls <- list()
  consensus_out_ls <- list()
  for (i in seq_along(best_clust_methods)) {
    clust_method_name <- best_clust_methods[[i]]$clust_method_name
    k <- best_clust_methods[[i]]$k
    key <- names(best_clust_methods)[i]
    
    # keep only fits matching this (clustering method, k) combination;
    # !! forces comparison against the loop variables, not the columns
    stable_clusters_tib_all <- clusters_tib_all |> 
      dplyr::filter(
        k == !!k,
        clust_method_name == !!clust_method_name
      )
  
    keep_clust_ls <- stable_clusters_tib_all$cluster_ids
    # compute neighborhood (co-clustering) matrix across the kept fits
    nbhd_mat_ls[[key]] <- get_consensus_neighborhood_matrix(keep_clust_ls)
    # aggregate stable clusters using consensus clustering; single-bracket
    # `[` deliberately keeps a one-element list holding the cluster ids
    consensus_out_ls[[key]] <- fit_consensus_clusters(
      nbhd_mat_ls[[key]], k = k
    )["cluster_ids"]
  }
  saveRDS(consensus_out_ls, file = consensus_clusters_results_path)
  saveRDS(nbhd_mat_ls, file = consensus_nbhd_results_path)
} else {
  # read in consensus clustering results (if already cached)
  consensus_out_ls <- readRDS(consensus_clusters_results_path)
  nbhd_mat_ls <- readRDS(consensus_nbhd_results_path)
}

Aside: What if…

What if we had used Spectral Clustering (n_neighbors = 60) with k = 9?

Neighborhood Heatmap

Consensus neighborhood heatmap between consensus clusters using Spectral (n_neighbors = 60) with k = 9 and K-means with k = 8.

Consensus neighborhood heatmap between consensus clusters using Spectral (n_neighbors = 60) with k = 9 and K-means with k = 8.

GCs per Cluster

Comparison of GC composition for each estimated consensus cluster.

What if we had used only the most stable clustering pipeline and done consensus clustering on the subsampled fits?

Neighborhood Heatmap

Consensus neighborhood heatmap between consensus clusters using K-means with k = 8, aggregated across all data preprocessing pipelines versus using only the most stable clustering pipeline.

Consensus neighborhood heatmap between consensus clusters using K-means with k = 8, aggregated across all data preprocessing pipelines versus using only the most stable clustering pipeline.

GCs per Cluster

Comparison of GC composition for each estimated consensus cluster.